# -*- coding:utf8 -*-
# !/usr/bin/env python

'''
National Enterprise Credit Information Publicity System (Beijing).
"Red" site: http://qyxy.baic.gov.cn/
Maintainer: Huang Yu (黄羽)
'''

import re
from bs4 import BeautifulSoup
from utils import kill_captcha
import random
import requests
from scpy.logger import get_logger
import time
import urllib
import sys

# Python 2 idiom: sys is reloaded so that sys.setdefaultencoding can be called,
# making UTF-8 the default codec for implicit str <-> unicode conversions.
# The two statements must stay in this order.
reload(sys)
sys.setdefaultencoding('utf8')

# Module-wide logger, created by scpy's get_logger from this file's path.
logger = get_logger(__file__)

# User-Agent header value placed in every header set built in this module
# (a desktop Chrome 47 on Linux identification string).
UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"


def get_year_report_info(companyName):
    """
    Download the annual-report pages of one company from qyxy.baic.gov.cn.

    The site is walked in a fixed order through a single ``requests``
    session (so cookies are shared): index page -> "simple" search page ->
    captcha image -> captcha check -> company search -> company detail
    pages -> annual-report pages. Only the header set is swapped between
    stages.

    :param companyName: company name or registration number to search for.
    :return:
        ``''``   - the attempt failed in a retryable way (empty captcha
                   image, captcha rejected / access anomaly, or the
                   "too many repeated requests" page); call again.
        ``None`` - the search found no matching company.
        ``[]``   - the company exists but has no annual report.
        ``list`` - one ``(year, html)`` tuple per annual report, e.g.
                   ``[('2013', 'html'), ('2014', 'html')]``; ``html`` is the
                   concatenation of that year's main page and its four
                   sub-pages (website, shareholders, guarantees, changes).
    :raises Exception: when an expected link or token cannot be found in a
        page (site layout changed), when the captcha download fails, when
        the site answers with its "illegal access" page, or when a report
        URL carries no ``cid`` parameter. Errors raised by the captcha
        service and by ``requests`` are propagated unchanged.
    """
    root_url = "http://qyxy.baic.gov.cn"
    index_url = 'http://qyxy.baic.gov.cn/'
    index_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'qyxy.baic.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent,
    }

    # ---- Stage 1: index page, which redirects via JavaScript -------------
    session = requests.session()
    session.headers = index_header
    index_res = session.get(index_url, timeout=200).content

    simple_href = re.findall('parent\.window\.location\.href.*?=.*?"(.*?)";', index_res)
    if not simple_href:
        logger.error("首页获取链接失败,网站发生变化！")
        raise Exception("首页获取链接失败,网站发生变化！")
    simple_href = simple_href[0]

    # ---- Stage 2: "simple" page carrying the ticket and captcha timestamp -
    simple_res = session.get(simple_href, timeout=200).content
    credit_ticket = re.findall('var credit_ticket.*?=.*?"(.*?)";', simple_res)
    current_time_millis = re.findall('currentTimeMillis=(\d+?)&', simple_res)
    if not (credit_ticket and current_time_millis):
        logger.error("simple网页获取链接失败,网站发生变化！")
        raise Exception("simple网页获取链接失败,网站发生变化！")
    credit_ticket = credit_ticket[0]
    current_time_millis = current_time_millis[0]

    # ---- Stage 3: download and solve the captcha --------------------------
    img_headers = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'qyxy.baic.gov.cn',
        'Referer': simple_href,
        'User-Agent': UserAgent,
    }
    img_url = 'http://qyxy.baic.gov.cn/CheckCodeYunSuan?currentTimeMillis=%s' % current_time_millis
    session.headers = img_headers
    try:
        captcha = session.get(img_url, timeout=200).content
    except Exception as e:
        logger.error("从网站下载验证码失败！重复下载！")
        logger.error(e)
        raise Exception("download captcha error")
    if not captcha:
        logger.error("从网站下载验证码为空！重复下载！")
        return ''  # retryable: caller downloads a fresh captcha

    try:
        res_code = kill_captcha(captcha, 'bj', 'jpeg')
    except Exception as e:
        logger.error("破解验证码的服务出现异常")
        logger.error(e)
        raise  # bare raise keeps the original traceback

    # ---- Stage 4: let the site validate the captcha answer ----------------
    check_headers = {
        'Accept': 'text/plain, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '0',
        'Host': 'qyxy.baic.gov.cn',
        'Origin': 'http://qyxy.baic.gov.cn',
        'Referer': simple_href,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': UserAgent,
    }
    check_query = {
        'check_code': res_code,
        'currentTimeMillis': current_time_millis,
        'random': str(int(random.random() * 100000)),
    }
    check_url = 'http://qyxy.baic.gov.cn/login/loginAction!checkCode.dhtml?' + urllib.urlencode(check_query)

    session.headers = check_headers
    check_res = session.post(check_url, timeout=100).content
    logger.info("网站返回：%s", check_res)
    if check_res == 'true':
        logger.info("验证码正确！网站返回：%s", check_res)
    elif check_res == 'false' or re.findall('访问异常', check_res):
        logger.error("验证码破解错误或访问异常，延时，重复破解！")
        time.sleep(random.random())  # short random back-off before the retry
        return ''
    else:
        logger.error("网页发生变化！")
        raise Exception("网页发生变化！")

    # ---- Stage 5: search for the company ----------------------------------
    com_query = {
        'currentTimeMillis': current_time_millis,
        'credit_ticket': credit_ticket,
        'check_code': res_code,
    }
    com_dict = {
        'queryStr': companyName,
        'module': '',
        'idFlag': 'qyxy',
    }
    com_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '173',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'qyxy.baic.gov.cn',
        'Origin': 'http://qyxy.baic.gov.cn',
        'Referer': simple_href,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent,
    }
    com_list_url = 'http://qyxy.baic.gov.cn/lucene/luceneAction!NetCreditLucene.dhtml?'
    session.headers = com_headers
    com_list_html = session.post(com_list_url, data=com_dict, timeout=20, params=com_query).content

    # Parse once and reuse the normalised text for both error-page checks.
    com_list_text = str(BeautifulSoup(com_list_html, 'html5lib'))
    if '非法访问错误页面' in com_list_text:
        raise Exception("网站返回:'非法访问错误页面' 信息！")
    if '您可能频繁重复请求' in com_list_text:
        time.sleep(1)
        return ''  # retryable: the site asked us to slow down

    com_url = re.findall('onclick="showDialog\(\'(.*?qyxq_view.*?)\',', com_list_html)
    reg_bus_ent_id = re.findall('reg_bus_ent_id=(.*?)&', com_list_html)
    if not (com_url and reg_bus_ent_id):
        logger.info("搜索的公司不存在！")
        return None
    logger.info("搜索的公司存在！")
    com_url = root_url + com_url[0]
    reg_bus_ent_id = reg_bus_ent_id[0]

    # ---- Stage 6: company detail pages ------------------------------------
    asic_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'qyxy.baic.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent,
    }

    logger.info("正在进入工商主页！")
    session.headers = asic_headers
    try:
        base_res = session.get(com_url).content
    except Exception as e:
        logger.info(e)
        raise

    # Project-local helpers, imported lazily exactly where they are first needed.
    from table import table_clean
    import bj_parse

    # NOTE(review): the return values of the bj_parse calls below are not used
    # by this function; they are kept because they may have side effects -
    # confirm against bj_parse.
    bj_parse.base(table_clean(base_res, "名称"))

    # Investors (shareholders). The first request asks for 50 rows; if the
    # page reports more records, it is fetched again with the full count.
    share_holder_url = re.findall("(/xycx/queryCreditAction!tzrlist_hs_all.*?)\'", base_res, re.S)
    share_holder_url = share_holder_url[0] if share_holder_url else ""
    ent_page = re.findall("ent_page=(.*?)&", share_holder_url)
    ent_page = ent_page[0] if ent_page else "0"
    share_holder_html = ""
    if share_holder_url:
        share_holder_url = root_url + share_holder_url
        share_holder_params = {
            "reg_bus_ent_id": reg_bus_ent_id,
            "ent_page": ent_page,
            "moreInfo": "",
            "newInv": "newInv",
            "fqr": "",
            "SelectPageSize": "50",
            "EntryPageNo": "1",
            "pageNo": "1",
            "pageSize": "50",
            "clear": "true",
        }
        share_holder_html = session.get(share_holder_url, params=share_holder_params).content
        share_holder_num = re.findall("记录总数(\d+?)条", share_holder_html)
        share_holder_num = int(share_holder_num[0]) if share_holder_num else 0
        if share_holder_num > 50:
            share_holder_params["SelectPageSize"] = share_holder_num
            share_holder_params["pageSize"] = share_holder_num
            share_holder_html = session.get(share_holder_url, params=share_holder_params).content

    bj_parse.share_holder(table_clean(share_holder_html, "投资"))

    # Key personnel, paged the same way as the investors list.
    member_url = re.findall("(/xycx/queryCreditAction!queryTzrxx_all.*?)\'", base_res, re.S)
    member_url = member_url[0] if member_url else ""
    member_html = ""
    if member_url:
        member_url = root_url + member_url
        member_params = {
            "reg_bus_ent_id": reg_bus_ent_id,
            "moreInfo": "",
            "SelectPageSize": "50",
            "EntryPageNo": "1",
            "pageNo": "1",
            "pageSize": "50",
            "clear": "true",
        }
        member_html = session.get(member_url, params=member_params).content
        member_num = re.findall("记录总数(\d+?)条", member_html)
        member_num = int(member_num[0]) if member_num else 0
        if member_num > 50:
            member_params["SelectPageSize"] = member_num
            member_params["pageSize"] = member_num
            member_html = session.get(member_url, params=member_params).content

    bj_parse.person(table_clean(member_html, "序号"))

    # Change records: the overview page links to one detail page per entry.
    # NOTE(review): the fetched detail pages are collected but not consumed
    # anywhere in this function yet; an empty list is a valid outcome.
    alter_url = root_url + "/newChange/newChangeAction!bgxx_view.dhtml?"
    alter_params = {"reg_bus_ent_id": reg_bus_ent_id}
    alter_html = session.get(alter_url, params=alter_params).content

    alter_list = re.findall("(/newChange/newChangeAction!bgxxInfoUrl.*?)\'", alter_html, re.S)
    alter_html_list = []
    for item in alter_list:
        alter_html_list.append(session.get(root_url + item, params=alter_params).content)

    # ---- Stage 7: annual reports ------------------------------------------
    nb_url = 'http://qyxy.baic.gov.cn/entPub/entPubAction!getTabForNB_new.dhtml?'  # enterprise annual reports - do not delete
    # nb_url = 'http://qyxy.baic.gov.cn/bjgtgsh/gtgsh_bjAction!gtgshxx.dhtml?'      # individual-business annual reports - do not delete
    # nb_url = 'http://qyxy.baic.gov.cn/bjgtgsh/gtgsh_bjAction!nznbFrame.dhtml?'    # farmers' cooperative annual reports - do not delete
    nb_query = {
        'entId': reg_bus_ent_id,
        'flag_num': '0',
        'clear': 'true',
        'timeStamp': str(int(random.random() * 100000)),
    }

    logger.info("正在获取企业年报！")
    nb_res = session.get(nb_url, params=nb_query).content
    # Each match is (relative report URL, four-digit year).
    nb_items = re.findall('onclick="showDialog\(\'(.*?)\',.*?>(\d+)年度', nb_res)

    if not nb_items:
        logger.info("没有年报信息！")
        return []

    year_report_res_html_list = []
    for report_path, year in nb_items:
        # Collected per year so that one year's tuple never contains the
        # HTML of another year.
        year_parts = []

        # Main page of this year's report.
        logger.info("正在获取企业年报主页！\t%s" % year)
        a_year_url = root_url + report_path
        year_parts.append(session.get(a_year_url).content)

        # Every sub-page is addressed by the report's cid; accept it whether
        # or not another parameter follows it in the URL.
        cid_match = re.search('cid=([^&]*)', a_year_url)
        if cid_match is None:
            logger.error("annual report url has no cid parameter: %s" % a_year_url)
            raise Exception("annual report url has no cid parameter: %s" % a_year_url)
        cid = cid_match.group(1)

        # (log message, endpoint, query) for: website/online shop,
        # shareholders and capital contribution, guarantees given to third
        # parties, and the modification history of the report.
        report_sections = [
            ("正在获取企业年报网站或网店信息！\t%s",
             'http://qyxy.baic.gov.cn/entPub/entPubAction!wz_bj.dhtml?',
             {'clear': 'true', 'cid': cid}),
            ("正在获取企业年报股东及出资信息！\t%s",
             'http://qyxy.baic.gov.cn/entPub/entPubAction!gdcz_bj.dhtml?',
             {'clear': 'true', 'cid': cid, 'entnature': ''}),
            ("正在获取企业年报对外提供保证担保信息！\t%s",
             'http://qyxy.baic.gov.cn/entPub/entPubAction!qydwdb_bj.dhtml?',
             {'clear': 'true', 'cid': cid, 'entnature': ''}),
            ("正在获取企业年报修改记录！\t%s",
             'http://qyxy.baic.gov.cn/entPub/entPubAction!qybg_bj.dhtml?',
             {'clear': 'true', 'cid': cid, 'year': year}),
        ]
        for message, section_url, section_query in report_sections:
            logger.info(message % year)
            year_parts.append(session.get(section_url, params=section_query).content)

        year_report_res_html_list.append((year, ''.join(year_parts)))

    return year_report_res_html_list


def year_report_run(companyName, MAXTIME=40):
    """
    Retry wrapper around ``get_year_report_info``.

    ``get_year_report_info`` returns ``''`` when an attempt failed in a
    retryable way (captcha wrong, captcha image empty, site asked to slow
    down). This function repeats the download until a definitive result is
    obtained or ``MAXTIME`` attempts have been used.

    :param companyName: company name or registration number.
    :param MAXTIME: maximum number of attempts.
    :return:
        ``None`` if the company does not exist;
        ``[]`` if the company exists but has no annual report;
        otherwise a list of ``(year, html)`` tuples, e.g.
        ``[('2013', 'html'), ('2014', 'html')]``.
    :raises Exception: when every attempt ended with a retryable failure
        (also when ``MAXTIME`` allows no attempt at all); any exception
        raised by ``get_year_report_info`` is printed and re-raised.
    """
    a_time = MAXTIME
    while a_time > 0:
        if a_time < MAXTIME:
            # Not the first attempt: report how many tries are left.
            logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
        a_time -= 1
        try:
            res = get_year_report_info(companyName)
        except Exception:
            import traceback
            traceback.print_exc()
            raise  # bare raise keeps the original traceback
        # Anything other than '' is final (None, [] or the report list) and
        # is returned right away, so a success on the very last attempt is
        # never lost.
        if res != '':
            return res
    raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)


if __name__ == "__main__":
    # Manual smoke run: fetch and print the annual reports of one
    # hard-coded sample company.
    sample_company = '百度在线网络技术（北京）有限公司'
    print(year_report_run(sample_company))
